from scraper.utils.html import *
from py2neo import Graph, Node, Relationship, NodeMatcher

# Connection to the local Neo4j instance holding the QUT knowledge graph.
# NOTE(review): credentials are hard-coded — consider loading from env/config.
qut_graph = Graph(
    'http://localhost:7474',
    username='neo4j',
    password='00001011'
)

# Matcher used throughout this module to look up existing nodes by label/property.
matcher = NodeMatcher(qut_graph)

# The university node (label '学校' = school) that teachers get linked to.
# Assumes the node already exists in the graph — otherwise this is None.
qut_node = matcher.match('学校', name='青岛理工大学').first()


def create_person_node(name, key, attrs):
    """Find an existing '人' (person) node by name, or build a new one.

    Args:
        name: person name used to look up an existing node.
        key: attribute key to backfill on an existing node when it is
            missing there; pass a falsy value to skip backfilling.
        attrs: full attribute dict used when a brand-new node is created
            (must contain `key` when backfilling is requested).

    Returns:
        A py2neo Node — either the matched existing node (possibly with
        the backfilled attribute pushed to the database) or a new, not
        yet persisted Node('人', **attrs); callers persist new nodes via
        qut_graph.create(Relationship(...)).
    """
    # Let Neo4j do the name lookup instead of scanning every '人' node
    # client-side (the original loop fetched and compared all persons on
    # every call — O(n) per lookup).
    neo_node = matcher.match('人', name=name).first()

    if neo_node:
        print('命中：', neo_node['name'])

        # Backfill a missing attribute on the already-existing node.
        if key and not neo_node[key]:
            print('添加数据')
            neo_node[key] = attrs[key]
            # Bug fix: property changes on a bound node stay local until
            # pushed — without this the backfilled data was never saved.
            qut_graph.push(neo_node)
    else:
        print('创建：', name)
        neo_node = Node('人', **attrs)

    return neo_node


# 名师风采 (distinguished-teachers page)
def insert_qut_teachers():
    """Scrape the university's 'famous teachers' listing and link every
    person to the university node, using the section heading as the
    relationship type."""
    url = 'http://www.qtech.edu.cn/szdw/msfc.htm'
    prefix = 'http://www.qtech.edu.cn/'

    soup = BeautifulSoup(requests.get(url).content, 'html.parser')

    # Section headings (whitespace stripped); they line up positionally
    # with the name lists that follow.
    titles = [re.sub(r'\s', '', heading.text) for heading in soup.find_all(class_='jszn45')]

    for index, section in enumerate(soup.find_all(class_='jsznlist')):
        for anchor in section.find_all('a'):
            desc = ''
            href = anchor.attrs['href']
            if href != '#':
                # Follow the detail link for the person's bio text.
                detail_url = prefix + re.sub(r'\.\./', '', href)
                detail_page = BeautifulSoup(requests.get(detail_url).content, 'html.parser')
                desc = detail_page.find(class_='mchcot').text

            title = titles[index]
            # Drop whitespace, then keep only the part before any
            # full-width parenthesised suffix.
            name = re.split(r'[（）]', re.sub(r'\s', '', anchor.text))[0]

            neo_node = create_person_node(name, 'info', {
                'name': name,
                'info': desc,
            })
            if neo_node:
                print('创建关系：', title)
                qut_graph.create(Relationship(neo_node, title, qut_node))


# 机械学院 (School of Mechanical & Automotive Engineering)
def insert_jx_teachers():
    """Load teacher names from the local dump '机械学院.txt' and link each
    one to the school node, using the line's leading label (before '：')
    as the relationship type."""
    school_node = matcher.match('学院', name='机械与汽车工程学院').first()

    with open('机械学院.txt', 'r') as file:
        org = ''
        for line in file:
            # Lines shaped like '1.xxx' announce the organisation that
            # applies to the following name lines.
            if re.match(r'\d+\.\w+', line):
                org = re.sub(r'[\d.\s]', '', line)
                print('命中所属：', org)
                continue

            group = re.split(r'[：\s]', line)
            title = group[0]
            # The last split element is typically the empty artifact left
            # after the trailing newline, hence the [1:-1] slice.
            for person in group[1:-1]:
                if person != '' and not re.search(r'\s', person):
                    neo_node = create_person_node(person, 'org', {
                        'name': person,
                        'org': org,
                    })
                    qut_graph.create(Relationship(neo_node, title, school_node))
                    print('创建关系：', title, person)


# 商学院 (Business School)
def insert_sxy_teachers():
    """Load teacher names from the local dump '商学院.txt' and attach each
    one as a '成员' (member) of the Business School node."""
    school_node = matcher.match('学院', name='商学院').first()

    with open('商学院.txt', 'r') as file:
        org = ''
        for line in file:
            # Department headers look like 'xxx系'.
            if re.match(r'\w+系', line):
                org = re.sub(r'\s', '', line)
                print('命中所属：', org)
                continue

            for person in re.split(r'\s+', line):
                # Skip empty split artifacts (leading/trailing whitespace).
                if person != '' and not re.search(r'\s', person):
                    neo_node = create_person_node(person, 'org', {
                        'name': person,
                        'org': org,
                    })
                    qut_graph.create(Relationship(neo_node, '成员', school_node))
                    print('创建关系：', person)


# 根据分页器获取页面总数 (read the total page count from a pagination widget)
def get_page_num(html, id):
    """Extract the total page count from a pagination element.

    Args:
        html: a parsed page (BeautifulSoup-like) exposing find(id=...).
        id: element id of the pagination container whose text contains a
            counter like '1/15'.

    Returns:
        The integer after the '/' — the total number of pages.

    Raises:
        ValueError: if the pagination text contains no '/<digits>'.
    """
    pagination = html.find(id=id)
    match = re.search(r'/(\d+)', pagination.text)
    if match is None:
        # Fail loudly with context instead of an opaque AttributeError
        # on `None.span()`.
        raise ValueError('no page count found in pagination text: %r' % pagination.text)
    # Bug fix: the original sliced from the '/' to the END of the text,
    # so int() blew up whenever anything followed the number (e.g.
    # '1/15 下一页'). Using the captured digit group is robust to that.
    return int(match.group(1))


# 土木学院 (School of Civil Engineering)
def insert_tm_teachers():
    """Crawl every page of the civil-engineering teacher listing and link
    each person to the school node as '教师' (teacher)."""
    school_node = matcher.match('学院', name='土木工程学院').first()

    base_url = 'http://civil.qut.edu.cn'
    start_url = 'http://civil.qut.edu.cn/szdw/jsmc.htm'

    # Expand the pagination into concrete page URLs, then include the
    # first page itself.
    total_pages = get_page_num(get_html(start_url), 'fanye183443')
    page_list = parse_pagination('%s/' % start_url.split('.htm')[0], '.htm', total_pages)
    page_list.append(start_url)

    for index, page_url in enumerate(page_list):
        article_list = get_html(page_url).find_all(class_='copy')
        for article_url in get_article_urls(article_list):
            # Strip relative '../' prefixes before joining with base_url.
            article_url = re.sub(r'(\.\./)+', '', article_url)
            print('正在爬取[%d]: %s' % (index, article_url))

            html = get_html('%s/%s' % (base_url, article_url))
            first_paragraph = parse(html.find('p'))
            if not first_paragraph:
                print('没有信息')
                continue

            name = re.sub(r'\s', '', first_paragraph)
            # Longer strings are assumed not to be a person's name.
            if len(name) > 8:
                continue

            print(name)
            info = parse(html.find('form'))
            neo_node = create_person_node(name, 'info', {
                'name': name,
                'info': info,
            })
            qut_graph.create(Relationship(neo_node, '教师', school_node))
            print('创建关系：', name)


# 环境学院 (School of Environmental & Municipal Engineering)
def insert_hj_teachers():
    """Crawl each department page of the environment school and link the
    listed teachers to the school node as '教师'."""
    school_node = matcher.match('学院', name='环境与市政工程学院').first()

    base_url = 'http://hjxy.qut.edu.cn'
    start_url = 'http://hjxy.qut.edu.cn/szdw/szgcx.htm'

    # The left-hand nav enumerates one listing page per department.
    nav_links = get_html(start_url).find(id='left_nav_sub_0').find_all('a')
    pages = [
        {
            'url': '%s/szdw/%s' % (base_url, link.attrs['href']),
            'org': link.text,
        }
        for link in nav_links
    ]

    for index, page in enumerate(pages):
        print(page['org'])
        article_list = get_html(page['url']).find_all(class_='li2')
        for article_url in get_article_urls(article_list):
            # Strip relative '../' prefixes before joining with base_url.
            article_url = re.sub(r'(\.\./)+', '', article_url)
            print('正在爬取[%d]: %s' % (index, article_url))

            html = get_html('%s/%s' % (base_url, article_url))
            if not parse(html.find('table')):
                print('没有信息')
                continue

            # The heading carries '个人简历/个人简介 <name>'; drop the label.
            name = re.sub(r'个人简[历介]\s', '', parse(html.find(class_='press')))
            print(name)
            info = html.find(id='vsb_content').text
            neo_node = create_person_node(name, 'info', {
                'name': name,
                'info': info,
                'org': page['org'],
            })
            qut_graph.create(Relationship(neo_node, '教师', school_node))
            print('创建关系：', name)


# 马克思学院 (School of Marxism)
def insert_mks_teachers():
    """Crawl the Marxism school's teacher gallery and link each person to
    the school node as '教授' (professor)."""
    school_node = matcher.match('学院', name='马克思主义学院').first()

    base_url = 'http://marxism.qut.edu.cn'
    start_url = 'http://marxism.qut.edu.cn/szdw/jsfc.htm'

    # Each gallery tile links to one teacher's detail page.
    pages = [
        {
            'url': '%s/%s' % (base_url, re.sub(r'\.\./', '', link.attrs['href'])),
            'name': link.text,
        }
        for link in get_html(start_url).find_all(class_='list_page_t')
    ]

    for page in pages:
        html = get_html(page['url'])
        if not parse(html.find(class_='box2-content')):
            print('没有信息')
            continue

        info = html.find(id='content_text').text
        neo_node = create_person_node(page['name'], 'info', {
            'name': page['name'],
            'info': info,
        })
        qut_graph.create(Relationship(neo_node, '教授', school_node))
        print('创建关系：', page['name'])


# 理学院 (School of Science)
def insert_lxy_teachers():
    """Crawl the science school's staff listing and link each teacher to
    the school node, using the title found in the headline
    (教授/副教授/硕导/博士) as the relationship type.
    """
    school_node = matcher.match('学院', name='理学院').first()

    base_url = 'http://lxy.qut.edu.cn'
    start_url = 'http://lxy.qut.edu.cn/xygk/szdw.htm'
    # Expand the pagination into concrete page URLs, then include the
    # first page itself.
    page_list = parse_pagination('%s/' % start_url.split('.htm')[0], '.htm', get_page_num(get_html(start_url), 'fanye201244'))
    page_list.append(start_url)

    index = 0
    for page_url in page_list:
        print(page_url)
        article_list = get_html(page_url).find(id='cms_list').find_all('li')
        for article_url in get_article_urls(article_list):
            # Strip relative '../' prefixes before joining with base_url.
            article_url = re.sub(r'(\.\./)+', '', article_url)
            print('正在爬取[%d]: %s' % (index, article_url))

            html = get_html('%s/%s' % (base_url, article_url))
            if parse(html.find('h1')):
                text = parse(html.find('h1'))
                # Headlines appear to look like '【org】name 副教授…', so
                # splitting on the bracket/title characters yields
                # ['', org, name, …] — TODO(review): confirm on live pages.
                group = re.split(r'[【】副教授博士硕导]', re.sub(r'\s', '', text))
                if len(group) > 2:
                    org = group[1]
                    name = group[2]
                    # Pull the first title keyword out of the raw headline.
                    match_result = re.search(r'副*教授|硕导|博士', text)
                    if match_result:
                        spans = match_result.span()
                        title = text[spans[0]: spans[1]]

                        info = parse(html.find(id='vsb_content'))
                        attrs = {
                            'name': name,
                            'info': info,
                            'org': org,
                            'title': title,
                        }
                        neo_node = create_person_node(name, 'info', attrs)
                        qut_graph.create(Relationship(neo_node, title, school_node))
                        print('创建关系：', name)
                else:
                    # NOTE(review): no message is printed when the title
                    # regex fails to match — silent skip in that branch.
                    print('没有信息')

        index += 1


# 管理学院 (School of Management Engineering)
def insert_glxy_teachers():
    """Crawl the management school's teacher directory and link each
    person to the school node, using the title found in the headline
    (教授/副教授/硕导/博士) as the relationship type.
    """
    school_node = matcher.match('学院', name='管理工程学院').first()

    base_url = 'http://glxy.qut.edu.cn'
    start_url = 'http://glxy.qut.edu.cn/szdw/jsml.htm'
    # Expand the pagination into concrete page URLs, then include the
    # first page itself.
    page_list = parse_pagination('%s/' % start_url.split('.htm')[0], '.htm', get_page_num(get_html(start_url), 'fanye200024'))
    page_list.append(start_url)

    index = 0
    for page_url in page_list:
        print(page_url)
        article_list = get_html(page_url).find(class_='winstyle200024').find_all(class_='c200024')
        for article_url in get_article_urls(article_list):
            # Strip relative '../' prefixes before joining with base_url.
            article_url = re.sub(r'(\.\./)+', '', article_url)
            print('正在爬取[%d]: %s' % (index, article_url))

            html = get_html('%s/%s' % (base_url, article_url))
            if parse(html.find('h1')):
                text = parse(html.find('h1'))
                # The name appears to be everything before the first of
                # the characters 副/教/授 — TODO(review): confirm format.
                name = re.split(r'[副教授]', re.sub(r'\s', '', text))[0]
                match_result = re.search(r'副*教授|硕导|博士', text)
                if match_result:
                    spans = match_result.span()
                    title = text[spans[0]: spans[1]]
                    info = parse(html.find(id='vsb_content'))
                    attrs = {
                        'name': name,
                        'info': info,
                        'title': title,
                    }
                    neo_node = create_person_node(name, 'info', attrs)
                    qut_graph.create(Relationship(neo_node, title, school_node))
                    print('创建关系：', name)
                else:
                    print('没有信息')

        index += 1


# 信控学院 (School of Information & Control Engineering)
def insert_xk_teachers():
    """Load teacher names from the local dump '信控学院.txt' and attach
    each one as a '成员' (member) of the school node.

    Lines shaped like '-xxx' set the current organisation; every other
    non-blank line is treated as one person's name.
    """
    school_node = matcher.match('学院', name='信息与控制工程学院').first()

    with open('信控学院.txt', 'r') as file:
        org = ''

        line = file.readline()
        while line:
            if re.match(r'-\w+$', line):
                org = re.sub(r'[-\s]', '', line)
                print('命中所属：', org)
            else:
                name = re.sub(r'\s', '', line)
                # Bug fix: skip blank lines — previously they produced
                # person nodes with an empty name. The sibling loaders
                # (机械/商学院) guard with `person != ''`; this one did not.
                if name:
                    attrs = {
                        'name': name,
                        'org': org,
                    }

                    neo_node = create_person_node(name, 'org', attrs)
                    qut_graph.create(Relationship(neo_node, '成员', school_node))
                    print('创建关系：', name)

            line = file.readline()


